RNP v2

In [1]:
from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
sys.path.insert(0, '../../')
#import Datanalytics as da 
from JKBio import TerraFunction as terra
%load_ext autoreload
%autoreload 2
from JKBio import Helper as h

import pickle
from taigapy import TaigaClient
tc = TaigaClient()
import numpy as np
import itertools

from bokeh.plotting import *
from bokeh.models import HoverTool
output_notebook()
import matplotlib.pyplot as plt
%load_ext rpy2.ipython
import seaborn as sns
import gseapy
import matplotlib.pyplot as plt
import networkx as nx
from JKBio.helper import pyDESeq2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering, DBSCAN

from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
Loading BokehJS ...

getting data

In [ ]:
! gsutil mv gs://transfer-amlproject/*MP7624* gs://transfer-amlproject/RNPv2/
In [ ]:
! gsutil -m cp -r gs://transfer-amlproject/RNPv3 gs://amlproject/RNA/
In [ ]:
! gsutil ls gs://amlproject/
In [ ]:
sampleset='RNPv3'
In [ ]:
terra.uploadFromFolder('amlproject','RNPv2/',
                       'broad-firecloud-ccle/hg38_RNAseq',samplesetname=sampleset,
                      fformat="fastqR1R2", sep='_MP7624')

Processing

In [ ]:
wm = dm.WorkspaceManager('broad-firecloud-ccle/hg38_RNAseq')
In [ ]:
submission_id = wm.create_submission("star_v1-0_BETA_cfg", sampleset, 'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
In [ ]:
submission_id = wm.create_submission("rsem_v1-0_BETA_cfg", 
                                      sampleset,'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
In [ ]:
submission_id = wm.create_submission("rsem_aggregate_results_v1-0_BETA_cfg", 
                                         sampleset)
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
In [ ]:
results = wm.get_sample_sets().loc[sampleset]
rsem_genes_expected_count = results['rsem_genes_expected_count']
In [ ]:
results
In [ ]:
mkdir ../../data/RNPv3

Loading

In [ ]:
! gsutil cp $rsem_genes_expected_count ../../data/RNPv3/
In [ ]:
file = '../../data/RNPv3/'+rsem_genes_expected_count.split('/')[-1]
In [ ]:
file
In [ ]:
! gunzip $file
In [ ]:
file
In [ ]:
rsem_genes_expected_count = pd.read_csv(file[:-3], sep='\t')
In [2]:
rsem_genes_expected_count = pd.read_csv("../../data/RNPv3/RNPv3.rsem_genes_expected_count.txt", sep='\t')
In [3]:
data = rsem_genes_expected_count.drop("transcript_id(s)",1)
In [4]:
data["gene_id"] = h.convertGenes(data['gene_id'])[0]
you need access to taiga for this (https://pypi.org/project/taigapy/)
20702 could not be parsed... we don't have all genes already
In [5]:
data=data.set_index('gene_id')
In [6]:
data
Out[6]:
1 10 11 12 13 14 15 16 17 18 ... 67 68 69 7 70 71 72 73 8 9
gene_id
TSPAN6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00
TNMD 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00
DPM1 1619.00 2465.00 1701.00 1535.00 1863.00 2093.00 2027.00 2202.00 2148.00 2235.00 ... 1620.00 1840.00 1729.00 1983.00 1926.0 1846.00 1915.00 2633.00 2451.00 2378.00
SCYL3 464.57 846.12 672.69 603.75 577.41 617.97 601.43 545.49 575.14 536.97 ... 430.78 460.04 437.36 542.42 572.5 507.48 580.49 713.56 670.02 576.38
C1orf112 780.43 1031.90 755.31 676.25 1232.70 1209.00 1309.60 1370.50 1245.90 1257.10 ... 949.22 1277.00 1032.60 1163.60 783.5 1088.50 1184.50 1572.40 1481.00 1332.90
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00164 3.00 5.00 8.00 2.00 2.00 1.00 2.00 1.00 3.00 3.00 ... 1.00 1.00 5.00 1.00 6.0 3.00 3.00 4.00 2.00 4.00
ERCC-00165 215.00 594.00 424.00 509.00 136.00 88.00 165.00 258.00 161.00 163.00 ... 93.00 139.00 87.00 127.00 628.0 207.00 151.00 241.00 187.00 176.00
ERCC-00168 3.00 12.00 9.00 8.00 0.00 8.00 0.00 5.00 5.00 1.00 ... 3.00 4.00 1.00 3.00 8.0 5.00 4.00 7.00 8.00 3.00
ERCC-00170 66.00 205.00 133.00 211.00 57.00 40.00 73.00 94.00 42.00 40.00 ... 41.00 56.00 33.00 50.00 141.0 72.00 92.00 110.00 89.00 88.00
ERCC-00171 13554.00 40900.00 29090.00 33242.00 10039.00 6399.00 10836.00 15684.00 9526.00 8893.00 ... 7058.00 7576.00 5882.00 8381.00 47913.0 12046.00 10447.00 17316.00 10492.00 12389.00

58813 rows × 73 columns

In [7]:
rename = {"1": "mr120-MV411-RNP_IRF2BP2-r4",
"2": "mr121-MV411-RNP_IRF2BP2-r5",
"3": "mr122-MV411-RNP_IRF2BP2-r6",
"4": "mr123-MV411-RNP_IRF8-r4",
"5": "mr124-MV411-RNP_IRF8-r5",
"6": "mr125-MV411-RNP_IRF8-r6",
"7": "mr126-MV411-RNP_MEF2D-r4",
"8": "mr127-MV411-RNP_MEF2D-r5",
"9": "mr128-MV411-RNP_MEF2D-r6",
"10": "mr129-MV411-RNP_MYC-r4",
"11": "mr130-MV411-RNP_MYC-r5",
"12": "mr131-MV411-RNP_MYC-r6",
"13": "mr132-MV411-RNP_RUNX1-r4",
"14": "mr133-MV411-RNP_RUNX1-r5",
"15": "mr134-MV411-RNP_RUNX1-r6",
"16": "mr135-MV411-RNP_RUNX2-r4",
"17": "mr136-MV411-RNP_RUNX2-r5",
"18": "mr137-MV411-RNP_RUNX2-r6",
"19": "mr138-MV411-RNP_SPI1-r4",
"20": "mr139-MV411-RNP_SPI1-r5",
"21": "mr140-MV411-RNP_SPI1-r6",
"22": "mr141-MV411-RNP_ZMYND8-r4",
"23": "mr142-MV411-RNP_ZMYND8-r5",
"24": "mr143-MV411-RNP_ZMYND8-r6",
"25": "mr144-MV411-RNP_LMO2-r4",
"26": "mr145-MV411-RNP_LMO2-r5",
"27": "mr146-MV411-RNP_LMO2-r6",
"28": "mr147-MV411-RNP_LYL1-r4",
"29": "mr148-MV411-RNP_LYL1-r5",
"30": "mr149-MV411-RNP_LYL1-r6",
"31": "mr150-MV411-RNP_MAX-r4",
"32": "mr151-MV411-RNP_MAX-r5",
"33": "mr152-MV411-RNP_MAX-r6",
"34": "mr153-MV411-RNP_ZEB2-r4",
"35": "mr154-MV411-RNP_ZEB2-r5",
"36": "mr155-MV411-RNP_ZEB2-r6",
"37": "mr156-MV411-RNP_MEF2C-r4",
"38": "mr157-MV411-RNP_MEF2C-r5",
"39": "mr158-MV411-RNP_MEF2C-r6",
"40": "mr159-MV411-RNP_MEIS1-r4",
"41": "mr160-MV411-RNP_MEIS1-r5",
"42": "mr161-MV411-RNP_MEIS1-r6",
"43": "mr162-MV411-RNP_FLI1-r4",
"44": "mr163-MV411-RNP_FLI1-r5",
"45": "mr164-MV411-RNP_FLI1-r6",
"46": "mr165-MV411-RNP_ELF2-r4",
"47": "mr166-MV411-RNP_ELF2-r5",
"48": "mr167-MV411-RNP_ELF2-r6",
"49": "mr168-MV411-RNP_GFI1-r4",
"50": "mr169-MV411-RNP_GFI1-r5",
"51": "mr170-MV411-RNP_GFI1-r6",
"52": "mr171-MV411-RNP_IKZF1-r4",
"53": "mr172-MV411-RNP_IKZF1-r5",
"54": "mr173-MV411-RNP_IKZF1-r6",
"55": "mr174-MV411-RNP_CEBPA-r4",
"56": "mr175-MV411-RNP_CEBPA-r5",
"57": "mr176-MV411-RNP_CEBPA-r6",
"58": "mr177-MV411-RNP_MYB-r4",
"59": "mr178-MV411-RNP_MYB-r5",
"60": "mr179-MV411-RNP_MYB-r6",
"61": "mr180-MV411-RNP_MYBL2-r1",
"62": "mr181-MV411-RNP_MYBL2-r2",
"63": "mr182-MV411-RNP_MYBL2-r3",
"64": "mr183-MV411-RNP_HOXA9-r4",
"65": "mr184-MV411-RNP_HOXA9-r5",
"66": "mr185-MV411-RNP_HOXA9-r6",
"67": "mr186-MV411-RNP_AAVS1-r1",
"68": "mr187-MV411-RNP_AAVS1-r2",
"69": "mr188-MV411-RNP_AAVS1-r3",
"70": "mr189-MV411-RNP_SP1-r4",
"71": "mr190-MV411-RNP_SP1-r5",
"72": "mr191-MV411-RNP_SP1-r6",
"73": "mr192-MV411-RNP_SP1-r7"}
In [8]:
data.columns
Out[8]:
Index(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2',
       '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30',
       '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41',
       '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52',
       '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63',
       '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '8',
       '9'],
      dtype='object')
In [9]:
data.columns = [rename[i] for i in data.columns]
In [10]:
data
Out[10]:
mr120-MV411-RNP_IRF2BP2-r4 mr129-MV411-RNP_MYC-r4 mr130-MV411-RNP_MYC-r5 mr131-MV411-RNP_MYC-r6 mr132-MV411-RNP_RUNX1-r4 mr133-MV411-RNP_RUNX1-r5 mr134-MV411-RNP_RUNX1-r6 mr135-MV411-RNP_RUNX2-r4 mr136-MV411-RNP_RUNX2-r5 mr137-MV411-RNP_RUNX2-r6 ... mr186-MV411-RNP_AAVS1-r1 mr187-MV411-RNP_AAVS1-r2 mr188-MV411-RNP_AAVS1-r3 mr126-MV411-RNP_MEF2D-r4 mr189-MV411-RNP_SP1-r4 mr190-MV411-RNP_SP1-r5 mr191-MV411-RNP_SP1-r6 mr192-MV411-RNP_SP1-r7 mr127-MV411-RNP_MEF2D-r5 mr128-MV411-RNP_MEF2D-r6
gene_id
TSPAN6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00
TNMD 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00
DPM1 1619.00 2465.00 1701.00 1535.00 1863.00 2093.00 2027.00 2202.00 2148.00 2235.00 ... 1620.00 1840.00 1729.00 1983.00 1926.0 1846.00 1915.00 2633.00 2451.00 2378.00
SCYL3 464.57 846.12 672.69 603.75 577.41 617.97 601.43 545.49 575.14 536.97 ... 430.78 460.04 437.36 542.42 572.5 507.48 580.49 713.56 670.02 576.38
C1orf112 780.43 1031.90 755.31 676.25 1232.70 1209.00 1309.60 1370.50 1245.90 1257.10 ... 949.22 1277.00 1032.60 1163.60 783.5 1088.50 1184.50 1572.40 1481.00 1332.90
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00164 3.00 5.00 8.00 2.00 2.00 1.00 2.00 1.00 3.00 3.00 ... 1.00 1.00 5.00 1.00 6.0 3.00 3.00 4.00 2.00 4.00
ERCC-00165 215.00 594.00 424.00 509.00 136.00 88.00 165.00 258.00 161.00 163.00 ... 93.00 139.00 87.00 127.00 628.0 207.00 151.00 241.00 187.00 176.00
ERCC-00168 3.00 12.00 9.00 8.00 0.00 8.00 0.00 5.00 5.00 1.00 ... 3.00 4.00 1.00 3.00 8.0 5.00 4.00 7.00 8.00 3.00
ERCC-00170 66.00 205.00 133.00 211.00 57.00 40.00 73.00 94.00 42.00 40.00 ... 41.00 56.00 33.00 50.00 141.0 72.00 92.00 110.00 89.00 88.00
ERCC-00171 13554.00 40900.00 29090.00 33242.00 10039.00 6399.00 10836.00 15684.00 9526.00 8893.00 ... 7058.00 7576.00 5882.00 8381.00 47913.0 12046.00 10447.00 17316.00 10492.00 12389.00

58813 rows × 73 columns

post processing and filtering

filter some more

In [11]:
toremove = np.argwhere(data.values.var(1)==0)
toremove.ravel()
Out[11]:
array([    1,    15,    24, ..., 58714, 58715, 58718])
In [12]:
toremove.shape
Out[12]:
(19991, 1)
In [13]:
data = data.drop(data.iloc[toremove.ravel()].index,0)
In [14]:
data.shape
Out[14]:
(38787, 73)
In [15]:
ERCC = data[~data.index.str.contains('ENSG00')]
In [16]:
ensg = data[data.index.str.contains('ENSG00')]
In [17]:
data = data[~data.index.str.contains('ENSG00')]

renormalize the data

In [ ]:
len(ERCC)
In [ ]:
ERCC

Loading the CRC members

In [18]:
ctf=pd.read_csv('../data/CTF.csv',header=None)[0].values.tolist()
ctf
Out[18]:
['ARID2',
 'CEBPA',
 'CEBPE',
 'E2F3',
 'FLI1',
 'FOSL2',
 'GFI1',
 'GFI1B',
 'HHEX',
 'IRF8',
 'LYL1',
 'MEF2C',
 'MEF2D',
 'MEIS1',
 'MTF1',
 'MYB',
 'MYC',
 'PLAGL2',
 'RUNX1',
 'RUNX2',
 'RXRA',
 'SETDB1',
 'SNAPC5',
 'SP1',
 'SPI1',
 'SREBF1',
 'STAT5B',
 'TERF2',
 'TFAP4',
 'ZEB2',
 'ZFPM1',
 'ZMYND8',
 'LMO2',
 'MAX',
 'ELF2',
 'ETV6',
 'HOXA9',
 'GATA2']

Making and running the dashboard

In [ ]:
%%R
library('erccdashboard')
In [ ]:
ERCC = ERCC.astype(int)
In [ ]:
ERCC['Feature'] = ERCC.index
In [ ]:
sns.heatmap(np.log2(ERCC[ERCC.index.str.contains('ERCC-')][['mr186-MV411-RNP_AAVS1-r1', 'mr187-MV411-RNP_AAVS1-r2', 'mr188-MV411-RNP_AAVS1-r3','mr129-MV411-RNP_MYC-r4', 'mr189-MV411-RNP_SP1-r4', 'mr120-MV411-RNP_IRF2BP2-r4']].values / ERCC[ERCC.index.str.contains('ERCC-')][['mr186-MV411-RNP_AAVS1-r1', 'mr187-MV411-RNP_AAVS1-r2', 'mr188-MV411-RNP_AAVS1-r3','mr129-MV411-RNP_MYC-r4', 'mr189-MV411-RNP_SP1-r4', 'mr120-MV411-RNP_IRF2BP2-r4']].values.mean(0)+1))
In [ ]:
experiments = list(set([i.split('-')[2] for i in ERCC.columns[:-1]]))
experiments.remove("RNP_AAVS1")
In [ ]:
#TODO: compute the mass from concentration
###################################################
### code chunk number 3: defineInputData
###################################################
%R datType = "count" # "count" for RNA-Seq data, "array" for microarray data
%R isNorm = F # flag to indicate if input expression measures are already normalized, default is FALSE 
%R filenameRoot = "RNPv2" # user defined filename prefix for results files
%R sample2Name = "AAAVS1" # name for sample 2 in the experiment
%R erccmix = "Single" # name of ERCC mixture design, "RatioPair" is default
%R erccdilution = 1/100 # dilution factor used for Ambion spike-in mixtures
%R spikeVol = 1 # volume (in microliters) of diluted spike-in mixture added to total RNA mass
%R choseFDR = 0.1 # user defined false discovery rate (FDR), default is 0.05
In [ ]:
cols = list(ERCC.columns)
cols.sort()
res={}
for val in experiments:
    d = {}
    e=0
    d.update({
        'Feature':'Feature'
    })
    for i in cols[:-1]:
        if val+'-' in i:
            e+=1
            d.update({i: val.split('_')[-1]+'_'+str(e)})
    d.update({
        'mr186-MV411-RNP_AAVS1-r1': 'AAAVS1_1',
        'mr187-MV411-RNP_AAVS1-r2': 'AAAVS1_2',
        'mr188-MV411-RNP_AAVS1-r3': 'AAAVS1_3'
    })
    a = ERCC[list(d.keys())].rename(columns=d)
    a.to_csv('../data/ERCC_estimation.csv', index=None)
    val = val.split('_')[-1]
    
    torm = 'RNPv2.'+val+'.AAAVS1.All.Pvals.csv'
    ! rm $torm 
    %R -i val print(val)
    %R print(sample2Name)
    %R a <- read.csv('../data/ERCC_estimation.csv')
    %R print(head(a))
    %R exDat = ''
    %R totalRNAmass <- 0.5
    try:
        %R -i val exDat = initDat(datType = datType, isNorm = isNorm, exTable = a, filenameRoot = filenameRoot, sample1Name = val, sample2Name = sample2Name, erccmix = erccmix, erccdilution = erccdilution, spikeVol = spikeVol, totalRNAmass = totalRNAmass, choseFDR = choseFDR)
        %R exDat = est_r_m(exDat)
        %R exDat = dynRangePlot(exDat)
    except Warning:
        print("failed for "+val)
        continue
    except:
        print('worked for '+val)
    %R print(summary(exDat))
    %R grid.arrange(exDat$Figures$dynRangePlot)
    %R grid.arrange(exDat$Figures$r_mPlot)
    %R grid.arrange(exDat$Figures$rangeResidPlot)
    %R -o rm rm <- exDat$Results$r_m.res$r_m.mn
    %R -o se se <- exDat$Results$r_m.res$r_m.mnse
    res[val] = (rm[0],se[0])
In [ ]:
for i, v in res.items():
    if abs(v[0]) > 3*v[1]:
        print(i, v[0])
In [ ]:
ERCC[ERCC.index.str.contains('ERCC-')][[i for i in ERCC.columns if 'AAVS1' in i]].mean()
In [ ]:
ERCC[ERCC.index.str.contains('ERCC-')][[i for i in ERCC.columns if 'SPI1' in i]].mean()
In [ ]:
scaling = res
In [ ]:
scaling
In [ ]:
h.dictToFileToFile(scaling,"../results/RNPv2/scaling.json")
In [19]:
scaling = h.fileToDict("../results/RNPv2/scaling.json")

Correlation analysis across replicates

In [ ]:
%matplotlib inline
ig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(data.corr(), 
            xticklabels=data.columns,
            yticklabels=data.columns, ax=ax)
In [ ]:
model = AgglomerativeClustering(n_clusters=15,linkage="average", 
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(data.corr())
ii = itertools.count(data.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
In [ ]:
data.to_csv('../results/RNPv2/counts.csv')
In [ ]:
data = pd.read_csv('../results/RNPv2/counts.csv',index_col=0)
In [ ]:
%matplotlib inline

sns.clustermap(data.corr(), figsize=(20, 20))

plt.savefig('../results/RNPv2/cluster_corr_count.pdf')
In [ ]:
data.sum().tolist()
In [ ]:
data.shape

Differential expression analysis

In [ ]:
data
In [20]:
experiments = list(set([i.split('-')[2] for i in data.columns[:-1]]))
In [21]:
experiments
Out[21]:
['RNP_GFI1',
 'RNP_MYBL2',
 'RNP_IRF2BP2',
 'RNP_MEF2D',
 'RNP_SPI1',
 'RNP_MYB',
 'RNP_RUNX1',
 'RNP_MYC',
 'RNP_ELF2',
 'RNP_MAX',
 'RNP_ZMYND8',
 'RNP_IKZF1',
 'RNP_SP1',
 'RNP_CEBPA',
 'RNP_LMO2',
 'RNP_AAVS1',
 'RNP_FLI1',
 'RNP_HOXA9',
 'RNP_MEF2C',
 'RNP_ZEB2',
 'RNP_MEIS1',
 'RNP_IRF8',
 'RNP_RUNX2',
 'RNP_LYL1']
In [22]:
experiments.remove("RNP_AAVS1")
In [23]:
data['gene_id'] = data.index
In [ ]:
data
In [ ]:
housekeeping = ["C1orf43", "CHMP2A", "EMC7", "GPI", "PSMB2", "PSMB4", "RAB7A", "REEP5", "SNRPD3", "VCP", "VPS29"]
# from https://www.cell.com/trends/genetics/pdf/S0168-9525(13)00089-9.pdf
In [25]:
#results = {}
for val in experiments:  
    print(val)
    design = pd.DataFrame(index=data.columns[:-1], columns=['DMSO','Target'], 
                          data=np.array([[1 if 'RNP_AAVS1' in i else 0 for i in data.columns[:-1]],[1 if val+'-' in i else 0 for i in data.columns[:-1]]]).T)
    design.index = design.index.astype(str).str.replace('-','.')
    deseq = pyDESeq2.pyDESeq2(count_matrix=data, design_matrix = design, 
                              design_formula='~DMSO + Target', gene_column="gene_id")
    if abs(scaling[val.split('_')[1]][0]) > scaling[val.split('_')[1]][1]:
        print("estimating sizeFactors for this one")
        deseq.run_estimate_size_factors(controlGenes=data.gene_id.str.contains("ERCC-"))
    elif val in results:
        continue
    deseq.run_deseq()
    deseq.get_deseq_result()
    r = deseq.deseq_result
    r.pvalue = np.nan_to_num(np.array(r.pvalue), 1)
    r.log2FoldChange = np.nan_to_num(np.array(r.log2FoldChange), 0)
    results[val] = r
RNP_GFI1
3.3.2
RNP_MYBL2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 154 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_IRF2BP2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 142 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MEF2D
3.3.2
RNP_SPI1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 142 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MYB
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 127 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_RUNX1
3.3.2
RNP_MYC
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 153 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_ELF2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 153 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MAX
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 156 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_ZMYND8
3.3.2
RNP_IKZF1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 149 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_SP1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 175 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_CEBPA
3.3.2
RNP_LMO2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 152 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_FLI1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 152 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_HOXA9
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 147 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MEF2C
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 150 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_ZEB2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 101 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MEIS1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 153 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_IRF8
3.3.2
RNP_RUNX2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 151 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_LYL1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 150 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

volcano plot with CRC members highlighted

In [ ]:
results
In [81]:
for val in experiments:
    a = h.volcano(results[val],tohighlight=ctf,title=val, maxvalue= 60, searchbox=True, showlabels=True)
    try:
        show(a)
    except RuntimeError:
        show(a)
In [ ]:
for k, val in results.items():
    val.to_csv('../results/RNPv2/deseq_'+k+".csv")
In [24]:
results = {}
des = ! ls ../results/RNPv2/deseq_RNP_*.csv
for val in des:
    results["RNP_"+val.split('RNP_')[1].split('.')[0]] = pd.read_csv(val,index_col=0)

Making the ccsv file for max

In [29]:
results.keys()
Out[29]:
dict_keys(['RNP_all', 'RNP_CEBPA', 'RNP_ELF2', 'RNP_FLI1', 'RNP_GFI1', 'RNP_HOXA9', 'RNP_IKZF1', 'RNP_IRF2BP2', 'RNP_IRF8', 'RNP_LMO2', 'RNP_LYL1', 'RNP_MAX', 'RNP_MEF2C', 'RNP_MEF2D', 'RNP_MEIS1', 'RNP_MYB', 'RNP_MYBL2', 'RNP_MYC', 'RNP_RUNX1', 'RNP_RUNX2', 'RNP_SP1', 'RNP_SPI1', 'RNP_ZEB2', 'RNP_ZMYND8'])
In [30]:
results.pop('RNP_all')
Out[30]:
RNP_CEBPA_fc_log2 RNP_CEBPA_padj RNP_CEBPA_pval RNP_ELF2_fc_log2 RNP_ELF2_padj RNP_ELF2_pval RNP_FLI1_fc_log2 RNP_FLI1_padj RNP_FLI1_pval RNP_GFI1_fc_log2 ... RNP_SP1_pval RNP_SPI1_fc_log2 RNP_SPI1_padj RNP_SPI1_pval RNP_ZEB2_fc_log2 RNP_ZEB2_padj RNP_ZEB2_pval RNP_ZMYND8_fc_log2 RNP_ZMYND8_padj RNP_ZMYND8_pval
TSPAN6 0.642130 NaN 0.930596 -0.064479 NaN 0.993022 0.424549 0.999961 0.954080 0.207821 ... 0.875304 -0.393437 NaN 9.573317e-01 0.022311 NaN 0.997585 -0.167083 NaN 0.981920
DPM1 -0.146773 0.175869 0.055432 0.109824 0.986459 0.142868 0.487813 0.940560 0.273223 0.086520 ... 0.024196 -1.578211 0.000952 1.818801e-04 0.208905 0.018236 0.003507 -0.164134 0.99984 0.026264
SCYL3 0.060378 0.701136 0.497529 0.008106 0.998610 0.924098 0.350550 0.940560 0.394458 -0.105824 ... 0.018508 -1.266968 0.003332 1.240800e-03 -0.055466 0.695996 0.516608 0.005266 0.99984 0.950385
C1orf112 0.126713 0.461042 0.242680 0.106333 0.986459 0.319820 0.597610 0.940560 0.211661 0.140322 ... 0.036461 -1.948156 0.000159 1.429172e-05 -0.052419 0.779765 0.626967 0.048705 0.99984 0.650304
FGR -0.582974 0.118835 0.031944 -0.130497 0.986459 0.639216 0.340905 0.945846 0.447361 -0.812484 ... 0.003588 -2.030338 0.000016 7.057808e-07 -0.127653 0.792324 0.646082 0.433732 0.99984 0.112956
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00164 -0.922723 0.662125 0.447910 -3.518530 0.986459 0.010779 -0.155868 0.999961 0.878590 -3.252890 ... 0.504492 0.345732 0.654121 6.166040e-01 -1.553886 0.401426 0.217815 -1.146495 NaN 0.305981
ERCC-00165 -0.503993 0.579337 0.354594 -1.030428 0.986459 0.054943 -0.159404 0.940560 0.209857 -0.689089 ... 0.619204 0.005076 0.965538 9.608078e-01 -0.227981 0.811671 0.674145 -0.715056 0.99984 0.185209
ERCC-00168 -0.723045 0.693688 0.487767 -0.300976 0.986871 0.732154 0.232214 0.999961 0.776989 -1.026813 ... 0.444718 0.032827 0.964837 9.597500e-01 -1.497910 0.329879 0.162756 -2.296520 0.99984 0.060384
ERCC-00170 -0.708528 0.426421 0.213389 -0.912857 0.986459 0.102280 -0.104913 0.999961 0.646929 -0.935368 ... 0.589275 -0.009492 0.965538 9.607581e-01 -0.245517 0.801773 0.660472 -0.831967 0.99984 0.135771
ERCC-00171 -0.449763 0.634579 0.415045 -0.853544 0.986459 0.110680 -0.083967 0.940560 0.298420 -0.633504 ... 0.942262 0.005329 0.953708 9.472072e-01 -0.295838 0.754667 0.592621 -0.676164 0.99984 0.208286

26672 rows × 69 columns

In [31]:
tosave = pd.DataFrame(index=results['RNP_CEBPA'].index)
for k,v in results.items():
    tosave[k+'_fc_log2'] = v.log2FoldChange
    tosave[k+'_padj'] = v.padj
    tosave[k+'_pval'] = v.pvalue
In [32]:
tosave.to_csv('../results/RNPv2/deseq_RNP_all.csv')

Looking at CRC members only

In [ ]:
ctf.extend(['IRF2BP2','MYBL2','IKZF1'])

maybe use adjusted p_value

In [ ]:
deseq = pd.DataFrame(index=ctf)
for k, val in results.items():
    deseq[k] = [i.log2FoldChange if i.pvalue<0.05 else 0 for a, i in val.loc[ctf].iterrows()]
In [ ]:
deseq
In [ ]:
fig = sns.clustermap(figsize=(25,20), cmap=plt.cm.RdYlBu, data=deseq,vmin=-1,vmax=1,xticklabels=deseq.columns, yticklabels=deseq.index)
In [ ]:
fig.savefig('../results/RNPv2/clustermap_ctf_deseq_all_scaled.pdf')
In [ ]:
deseq.columns = [i.split('_')[1] for i in deseq.columns]
deseq = deseq.loc[deseq.columns]
In [ ]:
deseq.to_csv('../results/RNPv2/deseq_CTFmat_allscaled.csv')
In [ ]:
deseq = pd.read_csv('../results/RNPv2/deseq_CTFmat.csv',index_col=0)
In [ ]:
net = nx.from_pandas_adjacency(((deseq < -0.3) | (deseq > 0.3)).T,create_using=nx.DiGraph)
In [ ]:
pos = nx.nx_agraph.graphviz_layout(net, prog="neato")
In [ ]:
colors = ['red' if deseq.loc[i[1],i[0]]> 0 else 'blue' for i in net.edges]

blue is down, red is up

In [ ]:
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True,edge_color=colors)
plt.show()
In [ ]:
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True,edge_color=colors)
plt.show()
In [ ]:
deseq[(deseq > -0.3) & (deseq < 0.3)]=0
net = nx.from_pandas_adjacency(deseq.T,create_using=nx.DiGraph)
In [ ]:
pos = nx.nx_agraph.graphviz_layout(net, prog='dot')
In [ ]:
colors = [-deseq.loc[i[1],i[0]] for i in net.edges]
In [ ]:
colors = [i/-min(colors) if i <0 else i/max(colors) for i in colors]
In [ ]:
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True, edge_color=colors,edge_cmap=plt.cm.RdYlBu)
plt.show()
In [ ]:
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True, edge_color=colors,edge_cmap=plt.cm.RdYlBu)
plt.show()

We are looking for bias in the data and the replicates

In [ ]:
col = {v:i for i, v in enumerate(set([i.split('-')[2] for i in data.columns[:-1]]))}
In [ ]:
red = PCA(2).fit_transform(data[data.columns[:-1]].T)
h.scatter(red, labels=data.columns[:-1], radi=60000, colors=[col[i.split('-')[2]] for i in data.columns[:-1]])
In [ ]:
red = PCA(30).fit_transform(data[data.columns[:-1]].T)
red = TSNE(2,4).fit_transform(red)

mr129-MYC-r4 seems weird

In [ ]:
h.scatter(red, labels=data.columns[:-1], radi=70, colors=[col[i.split('-')[2]] for i in data.columns[:-1]])
In [ ]:
pca = PCA(20)
red = pca.fit_transform(data[data.columns[:-1]].T)
In [ ]:
pca.explained_variance_ratio_

GSEA analysis

In [ ]:
data
In [ ]:
res = {}
In [ ]:
experiments
In [ ]:
res
In [ ]:
for val in experiments:
    print(val)
    totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
    cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
    if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
        print("rescaling this one")
        cols = [i for i in totest.columns if val+'-' in i]
        totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
    else:
        continue
    res[val] = gseapy.gsea(data=totest, gene_sets='WikiPathways_2013', 
                cls= cls, no_plot=False, processes=8)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
In [ ]:
with open('../data/pathways/wikipathway_RNPv2', 'wb') as f:
    pickle.dump(res,f)
In [ ]:
with open('../data/pathways/wikipathway_RNPv2','rb') as f:
    res = pickle.load(f)

Analysis on the wiki pathways geneset

In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
for val in experiments:
    res[val].res2d['Term'] = [i[3:].split('WP')[0] for i in res[val].res2d['Term']]
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
    plt.show()
In [ ]:
a = set()
for k, val in res.items():
    a.update(set(val.res2d.index))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
    for i,v in val.res2d.iterrows():
        a[i][n] = v.es
In [ ]:
res = pd.DataFrame(a, index=res.keys())
In [ ]:
res.columns = [i[3:].split('WP')[0] for i in res.columns]
In [ ]:
res.index = [i.split('_')[1] for i in res.index]
In [ ]:
fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1,xticklabels=res.columns, yticklabels=res.index)
In [ ]:
res.to_csv('../results/RNPv2/wikipathway_gsea.csv')
In [ ]:
fig.savefig("../results/RNPv2/enriched_terms_scaled_gsea.pdf")
In [ ]:
res = {}

Analysis on the entire set of pathways (biopathways)

In [ ]:
for i, val in enumerate(['RNP_MYB']):
    print(val)
    totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
    cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
    if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
        print("rescaling this one")
        cols = [i for i in totest.columns if val+'-' in i]
        totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
    elif val in res:
        continue
    res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015', 
                cls= cls, no_plot=False, processes=8)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    plt.figure(i)
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
In [ ]:
for i, val in enumerate(experiments):
    print(val)
    totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
    cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
    if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
        print("rescaling this one")
        cols = [i for i in totest.columns if val+'-' in i]
        totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
    elif val in res:
        continue
    res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015', 
                cls= cls, no_plot=False, processes=8)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    plt.figure(i)
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
In [ ]:
with open('../data/pathways/GO_Biological_Process_2015_RNPv2', 'wb') as f:
    pickle.dump(res,f)
In [ ]:
with open('../data/pathways/GO_Biological_Process_2015_RNPv2','rb') as f:
    res = pickle.load(f)
In [ ]:
for i, v in res.items():
    res[i].res2d['Term'] = [i.split('(GO')[0] for i in v.res2d['Term']]

creating matrices

In [ ]:
a = set()
for k, val in res.items():
    a.update(set(val.res2d.Term))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
    for i,v in val.res2d.iterrows():
        a[v.Term][n] = v.es
res = pd.DataFrame(a, index=res.keys())
In [ ]:
fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1, yticklabels=res.index ,cmap=plt.cm.RdYlBu)
In [ ]:
fig.savefig("../results/RNPv2/enriched_terms_scaled_gsea.pdf")
In [ ]:
model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
labels = model.fit_predict(res)
In [ ]:
sort = labels.argsort()

Correlation matrix across experiment on at the gene set level

In [ ]:
sns.clustermap(-res.T.corr(),cmap=plt.cm.RdYlBu,vmin=-1,vmax=1)
In [ ]:
a = h.plotCorrelationMatrix(res.values[sort], res.index[sort].tolist(), interactive=True, title="RNP2_bioproc_corr")#,colors=[labels[i] for i in sort]) 

similarity distance plot over the genesets

In [ ]:
red = PCA(2).fit_transform(res)
h.scatter(red, labels=res.index, radi=1, colors=labels, showlabels=True)
In [ ]:
red = TSNE(2,2).fit_transform(res)
h.scatter(red, labels=res.index, radi=9, colors=labels)
In [ ]:
res.to_csv('../results/RNPv2/biopathway_gsea.csv')
In [ ]:
res = pd.read_csv('../results/RNPv2/biopathway_gsea.csv',index_col=0)

Getting the correlation at the transcriptome level

In [ ]:
data = pd.DataFrame(index=results['RNP_SP1'].index.tolist())
for i, v in results.items():
    data[i]=v.log2FoldChange
In [ ]:
model = AgglomerativeClustering(n_clusters=8,linkage="average", 
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(data.values.T)
ii = itertools.count(data.values.shape[1])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
In [ ]:
model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
labels = model.fit_predict(data.valuees)
In [ ]:
sns.clustermap(data)
In [ ]:
a = h.plotCorrelationMatrix(data.values.T[sort], data.columns[sort].tolist(), interactive=True, title="transcriptome correlation")#,colors=[labels[i] for i in sort]) 
In [ ]:
## Filtered version (set to 0 genes with low p_value)
In [ ]:
data = pd.DataFrame(index=results['RNP_SP1'].index.tolist())
for i, v in results.items():
    v.loc[v[v.pvalue>0.01].index,"log2FoldChange"]==0
    data[i]=v.log2FoldChange
In [ ]:
a = h.plotCorrelationMatrix(data.values.T[sort], data.columns[sort].tolist(), interactive=True, title="transcriptome correlation",)
In [ ]:
mostvar = data[(~data.index.str.contains('ERCC-')) & (data.var(1)>4)]
In [ ]:
mostvar
In [ ]:
sns.clustermap(-mostvar.corr(), cmap=plt.cm.RdYlBu, vmin=-1, vmax=1)
In [ ]:
data.corr()
In [ ]: